library("tidyverse")
library("table1")
library("here")
raw_dir <- here("data/_raw/")
data_file <- "TCGA-HNSC.protein.tsv.gz"
data_loc <- "https://gdc-hub.s3.us-east-1.amazonaws.com/download/"
if( !dir.exists(raw_dir) ){
dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
download.file(
url = str_c(data_loc, data_file),
destfile = str_c(raw_dir, data_file))
}
raw_dir <- here("data/_raw/")
data_file <- "TCGA-HNSC.survival.tsv.gz"
data_loc <- "https://gdc-hub.s3.us-east-1.amazonaws.com/download/"
if( !dir.exists(raw_dir) ){
dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
download.file(
url = str_c(data_loc, data_file),
destfile = str_c(raw_dir, data_file))
}
raw_dir <- here("data/_raw/")
data_file <- "TCGA-HNSC.clinical.tsv.gz"
data_loc <- "https://gdc-hub.s3.us-east-1.amazonaws.com/download/"
if( !dir.exists(raw_dir) ){
dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
download.file(
url = str_c(data_loc, data_file),
destfile = str_c(raw_dir, data_file))
}
protein_data <- read_tsv(file = here("data/_raw/TCGA-HNSC.protein.tsv.gz"))
survival_data <- read_tsv(file = here("data/_raw/TCGA-HNSC.survival.tsv.gz"))
clinical_data <- read_tsv(file = here("data/_raw/TCGA-HNSC.clinical.tsv.gz"))
meta_data <- inner_join(clinical_data, survival_data, by = c("sample" = "sample"))
data_dir <- here("data/")
write_tsv(x = meta_data,
file = str_c(data_dir,
"01_meta_data_load.tsv.gz"))
# Pivot the peptide expression data longer
peptide_long <- protein_data |>
pivot_longer(cols = -1,
names_to = "sample",
values_to = "expression")
data_dir <- here("data/")
write_tsv( x = peptide_long,
file = str_c(data_dir,
"02_peptide_expression_load.tsv.gz"))
final_data <- inner_join(meta_data,
peptide_long,
by = "sample")
data_dir <- here("data/")
write_tsv( x = final_data,
file = str_c(data_dir,
"03_expression_meta_data_load.tsv.gz"))
# Choosing variables
expression_meta_data_clean_1 <- expression_meta_data_clean |>
select(sample_id = sample,
gender = gender.demographic,
race = race.demographic,
vital_status = vital_status.demographic,
overall_survival = OS,
primary_site,
pathologic_stage = ajcc_pathologic_stage.diagnoses,
age_at_index = age_at_index.demographic,
year_of_birth = year_of_birth.demographic,
year_of_death = year_of_death.demographic,
tissue_source_location = name.tissue_source_site,
cigarettes_per_day = cigarettes_per_day.exposures,
years_smoked = years_smoked.exposures,
alcohol_history = alcohol_history.exposures,
peptide_target,
expression)
# Removing TCGA in sample_id
expression_meta_data_clean_2 <- expression_meta_data_clean_1 |>
mutate(sample_id = sub("^TCGA-(.*)", "\\1", sample_id))
# Creating patient_id from sample_id
expression_meta_data_aug_1 <- expression_meta_data_aug |>
mutate(patient_id = sub("(.*)-..", "\\1", sample_id)) |>
relocate(patient_id, .before = sample_id)
# Creating 5 and 10 years age intervals as variables
expression_meta_data_aug_2 <- expression_meta_data_aug_1 |>
mutate(age_interval_5_year = cut(age_at_index,
breaks = seq(0, 100, by = 5),
labels = paste(seq(0, 95, by = 5),
seq(5, 100, by = 5) - 1,
sep = "-"),
include.lowest = TRUE),
age_interval_10_year = cut(age_at_index,
breaks = seq(0, 100, by = 10),
labels = paste(seq(0, 90, by = 10),
seq(10, 100, by = 10) - 1,
sep = "-"),
include.lowest = TRUE)) |>
# Placing variables after age_at_index variable
relocate(age_interval_5_year, .after = age_at_index) |>
relocate(age_interval_10_year, .after = age_interval_5_year)
# Creating expression mean variable
expression_meta_data_aug_3 <- expression_meta_data_aug_2 |>
group_by(peptide_target) |>
mutate(mean_expression = mean(expression, na.rm = TRUE)) |>
ungroup() # Removing the grouping to ensure that the following work is not messed up
# Creating expression mean variable for each pathologic state
mean_of_expression_pathologic_stages <- expression_meta_data_aug_3 |>
filter(!is.na(pathologic_stage)) |>
group_by(peptide_target, pathologic_stage) |>
summarise(mean_expression = mean(expression, na.rm = TRUE), .groups = "drop") |>
pivot_wider(names_from = pathologic_stage,
values_from = mean_expression,
names_prefix = "mean_expression_")
# Join the calculated means back into the original dataset with the join function
expression_meta_data_aug_4 <- expression_meta_data_aug_3 |>
right_join(mean_of_expression_pathologic_stages, by = "peptide_target")
# Creating binominal variable for alcohol history
expression_meta_data_aug_5 <- expression_meta_data_aug_4 |>
mutate(alcohol_history_binomial = case_when(alcohol_history == "No" ~ 0,
alcohol_history == "Yes" ~ 1)) |>
relocate(alcohol_history_binomial, .after = alcohol_history)